/********************************************************************************
Discrimination in Multi-Phase Systems: Evidence from Child Protection

Created on: 12/28/2022
Last Modified on: 2/18/2023

Description: This program generates the datasets containing omitted payoffs and 
number of children per investigation, for investigations from 2008 to 2019.

Note that we have removed the file directory names from this program for 
confidentiality reasons. 
********************************************************************************/


**************************
**(0) SETUP
**************************
*Start from a clean session: no data in memory, no paging of output,
*no leftover global macros, and no open log.
clear
set more off
*`_all' (with the underscore) is the keyword that drops every global macro;
*a bare `all' would only drop a macro literally named "all".
macro drop _all
capture log close
set seed 02042023

*Set directories
*Each global is concatenated directly with a file name below
*(e.g. ${data_raw}allegations.csv), so every path must end with a
*directory separator. The values are left blank for confidentiality.
global cleandata 
global tmpdata 
global output 
global data_raw 

**************************
**(1) GEN THE NEW OUTCOMES NEEDED FOR THIS PAPER FROM RAW DATA
**************************
*Load the raw allegation records. The path is quoted so that a directory
*name containing spaces does not break the command.
import delimited "${data_raw}allegations.csv", encoding(UTF-8) clear

**Flag allegations that resulted in formal investigations
gen cw_flag_screenedin=screeningdecision=="Accept and Assign for field investigation"
la var cw_flag_screenedin "Allegation was formally investigated"

**Flag children that were the alleged victims (all other rows are other children)
gen cw_flag_child_victim=child_role=="Alleged Victim (AV)"
la var cw_flag_child_victim "Child was alleged victim"

*Restrict the sample to alleged victims in screened-in allegations
keep if cw_flag_child_victim==1 & cw_flag_screenedin==1

**Build investigation ids that are consistent over the whole sample period
*Keep the original string report date under a new name and parse it into a
*numeric Stata daily date.
rename complaint_date cw_date
label variable cw_date "CW- Allegation Report Date"
generate complaint_date = date(cw_date, "MDY")

*20933 equals 24apr2017 in Stata's daily-date encoding. Reports dated before
*that day take their id from investigation_caseid instead of intake_id.
*NOTE(review): presumably the date the id system changed - confirm.
local id_switch_date 20933
replace intake_id = investigation_caseid if complaint_date < `id_switch_date'

*Short names for the child id and the investigation id
rename intakechildpartyid vicid
rename intake_id inv_caseid

**Keep and rename relevant variables
*Each variable is listed once (county_name previously appeared twice).
keep vicid county_name complaint_date inv_caseid allegationtypedesc cw_flag* relationtypeperptovictim finding catdesc intakeperppartyid cw_date

rename county_name cw_county
la var cw_county "CW- County of Investigation"

*Use the full variable name rather than the abbreviation "allegation", so the
*rename also works when variable abbreviation is turned off.
rename allegationtypedesc cw_allegation
la var cw_allegation "CW- Allegation Type"

rename relationtypeperptovictim relationship 
la var relationship "CW - Relationship of alleged perp to child"

rename finding cw_sub
la var cw_sub "CW- Substantiated"

*Foster care and substantiation indicators
*These are only used to build the omitted payoff outcomes further below.

*fc is 1 when catdesc is "1", unless the finding is "No Preponderance",
*"No Evidence", or empty, in which case it is forced to 0.
generate fc = catdesc=="1" & !inlist(cw_sub, "No Preponderance", "No Evidence", "")
generate preponderance = cw_sub=="Preponderance"

*Collapse each flag to the child X investigation level: the flag is 1 if any
*allegation row for that child in that investigation has it set.
foreach flag in fc preponderance {
	gegen any_`flag' = max(`flag'), by(vicid inv_caseid)
	drop `flag'
	rename any_`flag' `flag'
}

*Indicators for allegation types
*Tabulate the raw categories (including missing) for reference in the log
tabulate cw_allegation, missing

generate phyab = cw_allegation=="Physical Abuse"
generate neglect = inlist(cw_allegation, "Physical Neglect", "Medical Neglect", "Improper Supervision", "Failure To Protect")
generate maltreat = cw_allegation=="Maltreatment"

*Lift each indicator to the child X investigation level (1 if any allegation
*row of that type exists for the child in the investigation).
foreach type in phyab neglect maltreat {
	gegen any_`type' = max(`type'), by(vicid inv_caseid)
	drop `type'
	rename any_`type' `type'
}

*Reduce the data to one row per child X investigation
gduplicates drop vicid inv_caseid, force 

*Generate subsequent maltreatment outcomes
*For each child X investigation row, look at the child's NEXT investigation
*(next row within vicid after sorting by report date) and record whether it
*was reported between 1 day and `months'*30 days later.
*Only the immediately following investigation is examined; later ones that
*also fall inside the window are not considered.
local months 6
local horizon = `months'*30

sort vicid complaint_date inv_caseid, stable

tempvar reinvestigated censored
*1 if the child's next investigation falls inside the follow-up window.
*For a child's last row the lead is missing and inrange() returns 0.
by vicid: gen byte `reinvestigated' = inrange(complaint_date[_n+1], complaint_date+1, complaint_date+`horizon')
*1 if the follow-up window extends past the last date used as the data cutoff
gen byte `censored' = complaint_date+`horizon'>date("11/20/2019","MDY")

*Any re-investigation
capture drop inv`months'm
gen inv`months'm = `reinvestigated'
replace inv`months'm = . if `censored'
label var inv`months'm "Subject of another investigation within `months' months"

*Re-investigation that was substantiated / involved foster care: same as
*above, but set to 0 when the next investigation has the relevant flag at 0.
local flag_sub preponderance
local flag_fc fc
local text_sub "substantiated investigation"
local text_fc "foster care placement"

foreach kind in sub fc {
	capture drop inv`months'm_`kind'
	by vicid: gen inv`months'm_`kind' = `reinvestigated' & `flag_`kind''[_n+1]!=0
	replace inv`months'm_`kind' = . if `censored'
	label var inv`months'm_`kind' "Subject of another `text_`kind'' within `months' months"
}

drop `reinvestigated' `censored'

*Subsequent-investigation outcomes by allegation type (6 months = 180 days)
*As above, only the child's immediately following investigation is examined.
sort vicid complaint_date inv_caseid, stable

tempvar reinvestigated censored
*1 if the child's next investigation is reported 1 to 180 days later
by vicid: gen byte `reinvestigated' = inrange(complaint_date[_n+1], complaint_date+1, complaint_date+180)
*1 if the 180-day window runs past the last date used as the data cutoff
gen byte `censored' = complaint_date+180>date("11/20/2019","MDY")

*Next investigation within the window includes an allegation of this type
foreach alleg in phyab neglect {
	capture drop inv6m_`alleg'
	by vicid: gen inv6m_`alleg' = `reinvestigated' & `alleg'[_n+1]!=0
	replace inv6m_`alleg' = . if `censored'
	label var inv6m_`alleg' "Subject of another `alleg' allegation within 6 months"
}

*Same, additionally requiring that the next investigation was substantiated
foreach alleg in phyab neglect {
	capture drop inv6m_`alleg'_sub
	by vicid: gen inv6m_`alleg'_sub = `reinvestigated' & `alleg'[_n+1]!=0 & preponderance[_n+1]!=0
	replace inv6m_`alleg'_sub = . if `censored'
	label var inv6m_`alleg'_sub "Subject of another substantiated `alleg' allegation within 6 months"
}

drop `reinvestigated' `censored'

*Retain the child id, the investigation id, the county, and every variable
*whose name starts with "inv" (the outcomes built above), then write the
*child X investigation file to the temporary data folder.
local payoff_vars vicid inv_caseid inv* cw_county
keep `payoff_vars'

save "${tmpdata}inv_omitted_payoffs_qje.dta", replace 


*************************
**(2) GENERATE ADDITIONAL COVARIATE: NUMBER OF CHILDREN IN A GIVEN REFERRAL
*************************
*Re-read the raw allegation records with the same command and encoding used
*in section (1), so the id columns are read the same way in both output
*files. (insheet is a superseded command.)
import delimited "${data_raw}allegations.csv", encoding(UTF-8) clear

*Parse the report date and apply the same id rule as in section (1):
*reports dated before 20933 (24apr2017 in Stata's daily-date encoding)
*take their id from investigation_caseid.
gen complaint_date2 = date(complaint_date,"MDY")
replace intake_id = investigation_caseid if complaint_date2<20933

*Keep the dataset at the child X caseid level 
*Only the two key variables and the count are kept below, so which of the
*duplicate rows survives does not matter.
gduplicates drop intake_id intakechildpartyid, force

*Calculate number of children per referral 
*All children listed on the referral are counted; no role or screening
*filter is applied here. Sorting by child id within referral makes the row
*order of the saved file deterministic.
bysort intake_id (intakechildpartyid): gen num_children = _N

keep intakechildpartyid intake_id num_children
rename (intakechildpartyid intake_id) (vicid inv_caseid) 
save "${tmpdata}num_children_2008_2019", replace 